# Core data-handling and plotting stack for the HR attrition analysis.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# NOTE(review): IPython magic — this file only runs inside a Jupyter/IPython
# session; as a plain .py script this line is a SyntaxError.
%matplotlib inline
# Silence library deprecation/future-warning chatter in the notebook output.
warnings.filterwarnings("ignore")
# Show up to 100 rows/columns when displaying DataFrames.
pd.set_option('display.max_rows',100)
pd.set_option('display.max_columns',100)
# Load the HR attrition dataset and take a first look at it.
df = pd.read_excel('HR_Employee_Attrition Dataset.xlsx')
df.head()
df.shape
df.info()
# Class balance of the target variable.
df['Attrition'].value_counts(normalize=True)
# Missing values per column.
df.isna().sum()
# Distribution of the ordinal JobInvolvement feature.
df['JobInvolvement'].value_counts()
# Summary statistics, one feature per row.
df.describe().transpose()
# Distribution plots for the main numeric features.
# NOTE(review): sns.distplot is deprecated (removed in seaborn 0.14);
# switch to sns.histplot(..., kde=True) when upgrading seaborn.
sns.set_style("darkgrid")
f, axes = plt.subplots(2,4,figsize=(30,15))
sns.distplot(df.Age,ax=axes[0,0]);
sns.distplot(df.DailyRate,ax=axes[0,1]);
sns.distplot(df.TotalWorkingYears,ax=axes[0,2]);
sns.distplot(df.YearsSinceLastPromotion,ax=axes[0,3]);
sns.distplot(df.YearsAtCompany,ax=axes[1,0]);
# BUG FIX: this subplot duplicated YearsSinceLastPromotion (already drawn
# on axes[0,3]); show MonthlyIncome instead so the panel is not wasted.
sns.distplot(df.MonthlyIncome,ax=axes[1,1]);
sns.distplot(df.YearsWithCurrManager,ax=axes[1,2]);
sns.distplot(df.StandardHours,ax=axes[1,3]);
# Count plots for the categorical features.
# FIX: pass columns via the x=/data= keywords — positional Series arguments
# to countplot are deprecated since seaborn 0.12 and are no longer
# interpreted as the x variable; the keyword form draws the same plots.
sns.set_style("darkgrid")
f, axes = plt.subplots(2,3,figsize=(30,15))
sns.countplot(x="BusinessTravel", data=df, ax=axes[0,0]);
sns.countplot(x="Department", data=df, ax=axes[0,1]);
sns.countplot(x="EducationField", data=df, ax=axes[0,2]);
sns.countplot(x="Gender", data=df, ax=axes[1,0]);
chart = sns.countplot(x="JobRole", data=df, ax=axes[1,1]);
# JobRole labels are long — slant them so they stay readable.
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right')
sns.countplot(x="MaritalStatus", data=df, ax=axes[1,2]);
# Bivariate views and the correlation structure of the numeric features.
sns.scatterplot(x="Age", y="DailyRate", data=df, hue='Attrition')
sns.pairplot(df, hue='Attrition');
# EmployeeNumber is just a row identifier; StandardHours carries no spread in
# its distribution plot above — drop both as non-informative.
df.drop(columns = ['EmployeeNumber'],inplace=True)
df.drop(columns = ['StandardHours'],inplace=True)
plt.figure(figsize = (20,15))
# FIX: restrict to numeric columns — df still holds object-dtype columns
# here, and DataFrame.corr() raises TypeError on them in pandas >= 2.0
# (older pandas silently skipped them, so the plot is unchanged).
sns.heatmap(df.select_dtypes(include=np.number).corr(),annot=True,cmap='Blues')
# MonthlyIncome rises with JobLevel — the motivation for the drop_corr step.
sns.scatterplot(x="JobLevel", y="MonthlyIncome", data=df)
# Create correlation matrix
def drop_corr(df):
    """Drop one column from every pair of highly correlated numeric features.

    Builds the absolute correlation matrix over the numeric columns only
    (DataFrame.corr raises on object dtypes in pandas >= 2.0), keeps its
    upper triangle, and drops every column whose absolute correlation with
    an earlier column exceeds 0.7.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; non-numeric columns are kept untouched.

    Returns
    -------
    pandas.DataFrame
        New frame (original is not modified) without the correlated columns.
    """
    corr_matrix = df.select_dtypes(include=np.number).corr().abs()
    # Select upper triangle of correlation matrix (k=1 excludes the diagonal
    # of trivial self-correlations).
    # FIX: np.bool was removed in NumPy 1.24 — use the builtin bool instead.
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    # Find features with correlation greater than 0.7 with any earlier column.
    to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]
    # Drop features
    df1 = df.drop(to_drop, axis=1)
    return df1
# Remove the highly correlated numeric features, then recheck the schema.
df = drop_corr(df)
df.info()
# The remaining object-dtype columns are the categoricals still to encode.
categorical_features = list(df.select_dtypes(include='object').columns)
categorical_features
# Convert the categorical features to pandas' 'category' dtype ...
for column in categorical_features:
    df[column] = df[column].astype("category")

# ... collect the columns that now carry that dtype ...
category_columns = df.select_dtypes(["category"]).columns

# ... and replace each one by its integer category codes.
for column in category_columns:
    df[column] = df[column].cat.codes

df.head()
# Separate the target from the feature matrix.
y = df['Attrition']
X = df.drop(['Attrition'],axis=1)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Encoded categorical columns other than the target — kept for reference when
# deciding which columns would need scaling.
# (Removed: an unused StandardScaler instance and dead commented-out scaling
# code; every model below scales inside its own Pipeline instead.)
cat_list = category_columns.to_list()
cat_list.remove('Attrition')
cat_list
# 80/20 split with a fixed seed for reproducibility.
# NOTE(review): attrition targets are usually imbalanced — consider
# stratify=y (verify against the value_counts output above); left unchanged
# here to keep the split identical to the original run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
from sklearn.model_selection import cross_val_score,KFold
# BUG FIX: shuffle must be enabled when random_state is set — sklearn >= 0.24
# raises "Setting a random_state has no effect since shuffle is False" for
# the original KFold(random_state=7, n_splits=5).
k = KFold(n_splits=5, shuffle=True, random_state=7)
from sklearn.pipeline import Pipeline
# Accumulators for the model-comparison table built later.
model = []   # model names
tr = []      # training accuracy
te = []      # mean cross-validated score on the held-out set
f1 = []      # F1 score on the held-out set
auc = []     # ROC-AUC on the held-out set
from sklearn.metrics import f1_score, roc_auc_score
# --- Logistic Regression baseline ---
from sklearn.linear_model import LogisticRegression
pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression()),
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
# Record training accuracy, CV score on the hold-out set, F1 and ROC-AUC.
model.append('Logistic Regression')
tr.append(pipeline.score(X_train, y_train))
te.append(cross_val_score(pipeline, X_test, y_test, cv=k).mean())
f1.append(f1_score(y_test, y_pred))
auc.append(roc_auc_score(y_test, y_pred))
# --- Decision Tree ---
from sklearn.tree import DecisionTreeClassifier
pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", DecisionTreeClassifier(random_state=7)),
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
# Record the same four metrics as for the other models.
model.append('Decision Tree')
tr.append(pipeline.score(X_train, y_train))
te.append(cross_val_score(pipeline, X_test, y_test, cv=k).mean())
f1.append(f1_score(y_test, y_pred))
auc.append(roc_auc_score(y_test, y_pred))
# --- Random Forest ---
from sklearn.ensemble import RandomForestClassifier
pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(random_state=7)),
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
# Record the same four metrics as for the other models.
model.append('Random Forest')
tr.append(pipeline.score(X_train, y_train))
te.append(cross_val_score(pipeline, X_test, y_test, cv=k).mean())
f1.append(f1_score(y_test, y_pred))
auc.append(roc_auc_score(y_test, y_pred))
# --- Bagging ---
from sklearn.ensemble import BaggingClassifier
pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", BaggingClassifier(random_state=7)),
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
# Record the same four metrics as for the other models.
model.append('Bagging')
tr.append(pipeline.score(X_train, y_train))
te.append(cross_val_score(pipeline, X_test, y_test, cv=k).mean())
f1.append(f1_score(y_test, y_pred))
auc.append(roc_auc_score(y_test, y_pred))
# --- AdaBoost ---
from sklearn.ensemble import AdaBoostClassifier
pipeline = Pipeline([
    ('scaler',StandardScaler()),
    # Step renamed 'reg' -> 'clf' for consistency with the other classifier
    # pipelines (nothing reads the step name, so this is safe).
    ('clf', AdaBoostClassifier(random_state=7))
])
pipeline.fit(X_train,y_train)
# BUG FIX: y_pred was never recomputed in this block, so the F1 and ROC-AUC
# rows reported for AdaBoost were silently reusing the Bagging predictions.
y_pred = pipeline.predict(X_test)
model.append('AdaBoost')
tr.append(pipeline.score(X_train,y_train))
te.append(cross_val_score(pipeline, X_test, y_test, cv=k).mean())
f1.append(f1_score(y_test,y_pred))
auc.append(roc_auc_score(y_test,y_pred))
# --- Gradient Boosting ---
from sklearn.ensemble import GradientBoostingClassifier
pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", GradientBoostingClassifier(random_state=7)),
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
# Record the same four metrics as for the other models.
model.append('Gradient Boosting')
tr.append(pipeline.score(X_train, y_train))
te.append(cross_val_score(pipeline, X_test, y_test, cv=k).mean())
f1.append(f1_score(y_test, y_pred))
auc.append(roc_auc_score(y_test, y_pred))
# Baseline model comparison, best cross-validated test score shown first.
results = pd.DataFrame({
    'Model': model,
    'Training Score': tr,
    'Testing Score': te,
    'F1 Score': f1,
    'ROC/AUC Score': auc,
}).set_index('Model')
results.sort_values('Testing Score', ascending=False)
# Scale features once for the plain (non-pipeline) estimators below.
# The scaler is fit on X_train only; X_test is transformed with the
# training statistics.
ss = StandardScaler()
X_train = pd.DataFrame(ss.fit_transform(X_train),columns=X_train.columns)
X_test = pd.DataFrame(ss.transform(X_test),columns=X_test.columns)
from sklearn.model_selection import GridSearchCV
# Exhaustive grid search over a small Random Forest hyperparameter grid.
rf = RandomForestClassifier(random_state=7)
params = {
    'bootstrap': [True,False],
    'max_depth': [3,4,None],
    'max_features': ['sqrt','log2'],
    'min_samples_leaf': [1,3, 4],
    'min_samples_split': [2,3,5],
    # BUG FIX: 'warn' is not a valid n_estimators value (it was only an
    # internal pre-0.22 sklearn default sentinel) and makes fit() raise;
    # use the old default of 10 real trees instead.
    'n_estimators': [10,25,50]
}
grid = GridSearchCV(estimator = rf, param_grid = params, cv = k)
grid.fit(X_train,y_train)
# Start fresh accumulator lists for the tuned-model comparison table.
model = ['Random forest after Grid search']
y_pred = grid.predict(X_test)
tr = [grid.score(X_train,y_train)]
te = [grid.score(X_test,y_test)]
f1 = [f1_score(y_test,y_pred)]
auc = [roc_auc_score(y_test,y_pred)]
# Baseline Gradient Boosting fit, to be tuned by the randomized search below.
gb = GradientBoostingClassifier(random_state=7)
gb.fit(X_train,y_train)
from sklearn.model_selection import RandomizedSearchCV
# Number of boosting stages (trees) to try
n_estimators = [int(x) for x in np.linspace(start = 50 , stop = 150, num = 25)] # returns evenly spaced 25 numbers
# Number of features to consider at every split
max_features = ['log2', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 10, num = 5)] # returns evenly spaced numbers can be changed to any
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2,3,4,5,6,7,8,9,10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3, 4]
# Shrinkage applied to each tree's contribution (gradient-boosting learning rate)
learning_rate = [float(x) for x in np.linspace(0.1, 1, num = 10)]
# Create the random grid
params_r = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'learning_rate':learning_rate}
# Use the random grid to search for best hyperparameters.
# NOTE(review): this uses the KFold `k` defined earlier (5 folds) and the
# RandomizedSearchCV defaults of n_iter=10 sampled combinations on a single
# core — the previous comment claiming "3 fold ... 100 combinations ...
# all available cores" did not match the code.
# NOTE(review): the name `random` shadows the stdlib `random` module if it
# is ever imported later.
random = RandomizedSearchCV(estimator=gb, param_distributions=params_r,cv = k, random_state=7)
# Fit the random search model
random.fit(X_train, y_train)
# Best sampled hyperparameter combination.
random.best_params_
# Append the tuned Gradient Boosting metrics to the tuned-model lists.
model.append('Gradient Boosting after RandomCV')
y_pred = random.predict(X_test)
tr.append(random.score(X_train, y_train))
te.append(random.score(X_test, y_test))
f1.append(f1_score(y_test, y_pred))
auc.append(roc_auc_score(y_test, y_pred))
model, tr, te, f1, auc
# Tuned-model comparison table, best held-out score first.
results = pd.DataFrame({
    'Model': model,
    'Training Score': tr,
    'Testing Score': te,
    'F1 Score': f1,
    'ROC/AUC Score': auc,
})
results = results.set_index('Model').sort_values('Testing Score', ascending=False)
results